from langchain_text_splitters import HTMLHeaderTextSplitter


# Inline sample document for trying out the header splitter without needing a
# file on disk. The script below reads ./test.html, so this constant is only a
# ready-made alternative input that can be passed to `split_text` directly.
# It nests h1 > h2 > h3 headings inside <div> containers so that every
# configured header level appears at least once.
html_string = """
<!DOCTYPE html>
<html>
<body>
    <div>
        <h1>标题1</h1>
        <p>关于标题1的一些介绍文本。</p>
        <div>
            <h2>子标题1</h2>
            <p>关于子标题1的一些介绍文本。</p>
            <h3>子子标题1</h3>
            <p>关于子子标题1的一些文本。</p>
            <h3>子子标题2</h3>
            <p>关于子子标题2的一些文本。</p>
        </div>
        <div>
            <h2>子标题2</h2>
            <p>关于子标题2的一些文本。</p>
        </div>
        <br>
        <p>关于标题1的一些结束文本。</p>
    </div>
</body>
</html>
"""


# Header levels to split on. Each entry is (HTML tag, label); the label is
# handed to the splitter alongside the tag (per the LangChain docs it names
# the metadata field for that heading level).
split_rules = [
    ("h1", "一级标题"),
    ("h2", "二级标题"),
    ("h3", "三级标题"),
]

# Read the whole document in one go; the encoding is given explicitly so the
# Chinese text decodes the same way regardless of the platform default.
with open("./test.html", encoding="utf-8") as html_file:
    html_text = html_file.read()

splitter = HTMLHeaderTextSplitter(headers_to_split_on=split_rules)

# Emit each chunk returned by the splitter on its own line.
for chunk in splitter.split_text(html_text):
    print(chunk)